{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Neural Collaborative Filtering\n",
    "\n",
    "论文作者在 github 开源了 NCF 的实现，[hexiangnan/neural_collaborative_filtering](https://github.com/hexiangnan/neural_collaborative_filtering)，下面的代码中模型部分大量借鉴了原作的实现。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理\n",
    "\n",
    "这里使用的是 MovieLens 100k 数据集合。可以在此下载：http://files.grouplens.org/datasets/movielens/ml-100k.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>196</td>\n",
       "      <td>242</td>\n",
       "      <td>3</td>\n",
       "      <td>881250949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>186</td>\n",
       "      <td>302</td>\n",
       "      <td>3</td>\n",
       "      <td>891717742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>22</td>\n",
       "      <td>377</td>\n",
       "      <td>1</td>\n",
       "      <td>878887116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>244</td>\n",
       "      <td>51</td>\n",
       "      <td>2</td>\n",
       "      <td>880606923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "      <td>886397596</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating  timestamp\n",
       "0      196       242       3  881250949\n",
       "1      186       302       3  891717742\n",
       "2       22       377       1  878887116\n",
       "3      244        51       2  880606923\n",
       "4      166       346       1  886397596"
      ]
     },
     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
    "ratings = pd.read_csv('./data/ml-100k/u.data', sep='\\t', names=r_cols, encoding='latin-1')\n",
    "ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings = ratings.drop(['timestamp'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>196</td>\n",
       "      <td>242</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>186</td>\n",
       "      <td>302</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>22</td>\n",
       "      <td>377</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>244</td>\n",
       "      <td>51</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating\n",
       "0      196       242       3\n",
       "1      186       302       3\n",
       "2       22       377       1\n",
       "3      244        51       2\n",
       "4      166       346       1"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "user_id = ratings['user_id']\n",
    "ratings_train, ratings_test, _, _ = train_test_split(ratings, user_id, test_size=0.25, stratify=user_id, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 273,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((75000, 3), (25000, 3))"
      ]
     },
     "execution_count": 273,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings_train.shape, ratings_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构建数据集\n",
    "\n",
    "样本集就是 `item, user, label` 这样的元组，user 和 item 是输入，label 是输出。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_instances_from_df(ratings, num_negative=4):\n",
    "    # 正例\n",
    "    users = ratings['user_id'].values\n",
    "    items = ratings['movie_id'].values\n",
    "    labels = np.ones_like(users)\n",
    "    \n",
    "    user_item_pairs = set(zip(users, items))\n",
    "    \n",
    "    negative_users = []\n",
    "    negative_items = []\n",
    "    \n",
    "    # 负例\n",
    "    for user in users:\n",
    "        i = 0\n",
    "        while i < num_negative:\n",
    "            item = np.random.choice(items)\n",
    "            if (user, item) not in user_item_pairs:\n",
    "                i += 1\n",
    "                negative_users.append(user)\n",
    "                negative_items.append(item)\n",
    "                \n",
    "    \n",
    "    users = np.r_[users, negative_users]\n",
    "    items = np.r_[items, negative_items]\n",
    "    labels = np.r_[labels, np.zeros_like(negative_users)]\n",
    "    \n",
    "    index = np.random.permutation(users.shape[0])\n",
    "\n",
    "    users = users[index]\n",
    "    items = items[index]\n",
    "    labels = labels[index]\n",
    "    \n",
    "    return users, items, labels\n",
    "    \n",
    "users, items, labels = build_instances_from_df(ratings_train)\n",
    "users_test, items_test, labels_test = build_instances_from_df(ratings_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generalized Matrix Factorization (GMF)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow.keras as keras\n",
    "from tensorflow.keras.models import Sequential, Model\n",
    "from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply\n",
    "from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop\n",
    "\n",
    "def build_GMF_model(num_users, num_items, latent_dim):\n",
    "    user_input = Input(shape=(1,), dtype='int32', name='user_input')\n",
    "    item_input = Input(shape=(1,), dtype='int32', name='item_input')\n",
    "\n",
    "    user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,\n",
    "                               input_length=1, name='user_embedding')\n",
    "    item_embedding = Embedding(input_dim = num_items, output_dim=latent_dim,\n",
    "                               input_length=1, name='item_embedding')   \n",
    "    \n",
    "    user_latent = Flatten()(user_embedding(user_input))\n",
    "    item_latent = Flatten()(item_embedding(item_input))\n",
    "    \n",
    "    # Element-wise product\n",
    "    predict_vector = Multiply()([user_latent, item_latent])\n",
    "    \n",
    "    # Final prediction layer\n",
    "    prediction = Dense(1, activation='sigmoid',\n",
    "                       kernel_initializer=keras.initializers.lecun_uniform(),\n",
    "                       name='prediction')(predict_vector)\n",
    "    \n",
    "    model = Model(inputs=[user_input, item_input], outputs=prediction)\n",
    "\n",
    "    return model\n",
    "\n",
    "num_users = ratings['user_id'].unique().shape[0] + 1\n",
    "num_items = ratings['movie_id'].unique().shape[0] + 1\n",
    "learning_rate = 0.001\n",
    "\n",
    "GMF_model = build_model(num_users, num_items, 16)\n",
    "GMF_model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 300000 samples, validate on 75000 samples\n",
      "Epoch 1/5\n",
      "300000/300000 [==============================] - 23s 76us/sample - loss: 0.4974 - acc: 0.8003 - val_loss: 0.4585 - val_acc: 0.8016\n",
      "Epoch 2/5\n",
      "300000/300000 [==============================] - 23s 77us/sample - loss: 0.4218 - acc: 0.8126 - val_loss: 0.4300 - val_acc: 0.8095\n",
      "Epoch 3/5\n",
      "300000/300000 [==============================] - 24s 81us/sample - loss: 0.3848 - acc: 0.8289 - val_loss: 0.4257 - val_acc: 0.8114\n",
      "Epoch 4/5\n",
      "300000/300000 [==============================] - 23s 75us/sample - loss: 0.3636 - acc: 0.8395 - val_loss: 0.4289 - val_acc: 0.8116\n",
      "Epoch 5/5\n",
      "300000/300000 [==============================] - 23s 78us/sample - loss: 0.3508 - acc: 0.8467 - val_loss: 0.4341 - val_acc: 0.8109\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7f9cd4357080>"
      ]
     },
     "execution_count": 284,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GMF_model.fit([np.array(users), np.array(items)],\n",
    "              np.array(labels), epochs=5,\n",
    "              batch_size=32, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "125000/125000 [==============================] - 5s 37us/sample - loss: 0.5451 - acc: 0.7418\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.5451006612052918, 0.741792]"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GMF_model.evaluate([np.array(users_test), np.array(items_test)], np.array(labels_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-Layer Perceptron (MLP)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_mlp_model(num_users, num_items, layer_units=[20,10], reg_layers=[0,0]):\n",
    "\n",
    "    user_input = Input(shape=(1,), dtype='int32', name='user_input')\n",
    "    item_input = Input(shape=(1,), dtype='int32', name='item_input')\n",
    "\n",
    "    user_embedding = Embedding(input_dim=num_users, output_dim=layer_units[0]//2, name='user_embedding',\n",
    "                               input_length=1)\n",
    "    item_embedding = Embedding(input_dim=num_items, output_dim=layer_units[0]//2, name='item_embedding',\n",
    "                                input_length=1)   \n",
    "    \n",
    "    # Crucial to flatten an embedding vector!\n",
    "    user_latent = Flatten()(user_embedding(user_input))\n",
    "    item_latent = Flatten()(item_embedding(item_input))\n",
    "    \n",
    "    # The 0-th layer is the concatenation of embedding layers\n",
    "    vector = keras.layers.Concatenate(axis=-1)([user_latent, item_latent])\n",
    "    \n",
    "    for i, unit in enumerate(layer_units):\n",
    "        layer = Dense(unit, activation='relu', name='layer_{}'.format(i))\n",
    "        vector = layer(vector)\n",
    "        \n",
    "    # Final prediction layer\n",
    "    prediction = Dense(1, activation='sigmoid',\n",
    "                       kernel_initializer=keras.initializers.lecun_uniform(seed=None),\n",
    "                       name='prediction')(vector)\n",
    "    \n",
    "    model = Model(inputs=[user_input, item_input], outputs=prediction)\n",
    "    \n",
    "    return model\n",
    "\n",
    "mlp_model = build_mlp_model(num_users, num_items)\n",
    "mlp_model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 300000 samples, validate on 75000 samples\n",
      "Epoch 1/3\n",
      "300000/300000 [==============================] - 48s 159us/sample - loss: 0.4958 - acc: 0.8000 - val_loss: 0.4796 - val_acc: 0.8004\n",
      "Epoch 2/3\n",
      "300000/300000 [==============================] - 49s 162us/sample - loss: 0.4617 - acc: 0.8033 - val_loss: 0.4573 - val_acc: 0.8053\n",
      "Epoch 3/3\n",
      "300000/300000 [==============================] - 50s 167us/sample - loss: 0.4402 - acc: 0.8069 - val_loss: 0.4415 - val_acc: 0.8064\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7f9bf9636cf8>"
      ]
     },
     "execution_count": 278,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mlp_model.fit([np.array(users), np.array(items)],\n",
    "              np.array(labels), epochs=3,\n",
    "              batch_size=16, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "125000/125000 [==============================] - 4s 35us/sample - loss: 0.4630 - acc: 0.7960\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.46300480207824707, 0.796016]"
      ]
     },
     "execution_count": 279,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mlp_model.evaluate([np.array(users_test), np.array(items_test)], np.array(labels_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural matrix factorization model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_NeuMF_model(num_users, num_items, mf_dim=16, layer_units=[20]):\n",
    "    user_input = Input(shape=(1,), dtype='int32', name='user_input')\n",
    "    item_input = Input(shape=(1,), dtype='int32', name='item_input')\n",
    "\n",
    "    mf_user_embedding = Embedding(input_dim=num_users, output_dim=mf_dim,\n",
    "                                  name='mf_user_embedding', input_length=1)\n",
    "    mf_item_embedding = Embedding(input_dim=num_items, output_dim=mf_dim,\n",
    "                                  name='mf_item_embedding', input_length=1)\n",
    "    \n",
    "    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=layer_units[0]//2,\n",
    "                                   name='mlp_user_embedding', input_length=1)\n",
    "    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=layer_units[0]//2,\n",
    "                                   name='mlp_item_embedding', input_length=1)   \n",
    "    \n",
    "    # MF part\n",
    "    mf_user_latent = Flatten()(mf_user_embedding(user_input))\n",
    "    mf_item_latent = Flatten()(mf_item_embedding(item_input))\n",
    "    mf_vector = Multiply()([mf_user_latent, mf_item_latent])\n",
    "\n",
    "    # MLP part \n",
    "    mlp_user_latent = Flatten()(mlp_user_embedding(user_input))\n",
    "    mlp_item_latent = Flatten()(mlp_item_embedding(item_input))\n",
    "    mlp_vector = keras.layers.Concatenate(axis=-1)([mlp_user_latent, mlp_item_latent])\n",
    "    \n",
    "    for i, unit in enumerate(layer_units):\n",
    "        layer = Dense(unit, activation='relu', name='layer_{}'.format(i))\n",
    "        mlp_vector = layer(mlp_vector)\n",
    "\n",
    "    predict_vector = keras.layers.Concatenate(axis=-1)([mf_vector, mlp_vector])\n",
    "    \n",
    "    prediction = Dense(1, activation='sigmoid',\n",
    "                       kernel_initializer=keras.initializers.lecun_uniform(seed=None),\n",
    "                       name = \"prediction\")(predict_vector)\n",
    "    \n",
    "    model = Model(inputs=[user_input, item_input], outputs=prediction)\n",
    "    \n",
    "    return model\n",
    "\n",
    "NeuMF_model = build_NeuMF_model(num_users, num_items)\n",
    "NeuMF_model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 300000 samples, validate on 75000 samples\n",
      "Epoch 1/3\n",
      "300000/300000 [==============================] - 56s 185us/sample - loss: 0.4785 - acc: 0.8009 - val_loss: 0.4477 - val_acc: 0.8031\n",
      "Epoch 2/3\n",
      "300000/300000 [==============================] - 54s 180us/sample - loss: 0.4101 - acc: 0.8165 - val_loss: 0.4283 - val_acc: 0.8075\n",
      "Epoch 3/3\n",
      "300000/300000 [==============================] - 53s 178us/sample - loss: 0.3717 - acc: 0.8348 - val_loss: 0.4259 - val_acc: 0.8107\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7f9be3f279b0>"
      ]
     },
     "execution_count": 287,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "NeuMF_model.fit([np.array(users), np.array(items)],\n",
    "              np.array(labels), epochs=3,\n",
    "              batch_size=16, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "125000/125000 [==============================] - 5s 37us/sample - loss: 0.5358 - acc: 0.7434\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0.5358014254379272, 0.74336]"
      ]
     },
     "execution_count": 288,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "NeuMF_model.evaluate([np.array(users_test), np.array(items_test)], np.array(labels_test))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
